Data Preprocessing

library(ggplot2)
library (readr)
library(tidyverse)
library(dplyr)
library(covidcast)
library(lubridate)
library(ggpubr)
library(reshape2)
library(tidyr)
library(viridis)
library(gridExtra)
library(zoo)
library(cowplot)
library(gplots)
library(car)
library(nortest)
library(MASS)

source("code/painter.r")
source("code/load_all_data.r")
source("code/shifter.r")
STARTDATE <- "2020-02-20"
ENDDATE <- lubridate::today()
GEO_TYPE = "state" # state-level
GEO_VALUE = "*" # all states
EXCLUDED_AREAS = c("as","gu", "mp","vi") # excluded areas due to small sample size
# Load the covidcast signal data
data <- load_covidcast_data(STARTDATE, ENDDATE, GEO_TYPE, GEO_VALUE, EXCLUDED_AREAS)

# Load the policy data
policy_signal <- load_policy_data(STARTDATE, ENDDATE)
# Compute the average on a 7day sliding window
policy_signal <-policy_signal %>%
    arrange(desc(geo_value)) %>% 
    group_by(geo_value) %>% 
    mutate(num.policy.7avg = rollmean(total.num.policy, k = 7, fill = NA))%>%
    ungroup()

# Finalize the covidcast-like signal for governemnt intervention
covidcast.like.policy.signal <- policy_signal %>% transmute(
  geo_value = geo_value,
  signal = "policy_7dav_num",
  time_value = time_value,
  direction = NA,
  issue = lubridate::today(),
  lag = issue - time_value,
  value = num.policy.7avg,
  stderr = NA,
  sample_size = NA,
  data_source = 'University of Washington')
# We fist get all the signals
ftime.mobility <- data$Full.Time.Mobility[c("time_value","geo_value","value")] #mobility

case.count <- data$Avg.Confirmed.Case.Count[c("time_value","geo_value","value")]# case count

cumulative.case.count <-data$Cum.Avg.Case.Count[c("time_value","geo_value","value")]# cumulative case count

cumulative.case.count.prop <-data$Cum.Avg.Case.Count.Prop[c("time_value","geo_value","value")] # cumulative case count per 100,000

avg.death.count <- data$Avg.Death.Case.Count[c("time_value","geo_value","value")] #death count

cum.avg.death.count <- data$Cum.Avg.Death.Count[c("time_value","geo_value","value")] #cumulative death count

doc.visit <- data$smoothed_cli[c("time_value","geo_value","value")] # doctor visit

adj.doc.visit <- data$smoothed_adj_cli[c("time_value","geo_value","value")] # adjusted doctor visit

# Change the column name
colnames(ftime.mobility)[3]<-"full_time_work_prop"
colnames(case.count)[3] <- "confirmed_7dav_incidence_num"
colnames(cumulative.case.count)[3] <- "confirmed_7dav_cumulative"
colnames(cumulative.case.count.prop)[3] <- "confirmed_7dav_cumulative_prop"
colnames(avg.death.count)[3]<-"deaths_7dav_incidence_num"
colnames(cum.avg.death.count)[3] <- "deaths_7dav_cumulative_num"
colnames(doc.visit)[3] <- "smoothed_cli"
colnames(adj.doc.visit)[3] <- "smoothed_adj_cli"

# Create a list of confounders for left join with mobility
confounders <- list(case.count, cumulative.case.count, cumulative.case.count.prop,
     avg.death.count, cum.avg.death.count, doc.visit, adj.doc.visit)

# Turn the polical signal to be factors
factored.policy.signal <- cbind(policy_signal[1:2], lapply(policy_signal[3:17], as.factor),policy_signal[18:19])

# Intervention left join mobility with policy signal
intervention_mobility_case <- left_join(ftime.mobility,factored.policy.signal , by=c("time_value", "geo_value"))

# Left join again with all other potential confounders
for (confounder in confounders){
  intervention_mobility_case <- left_join(intervention_mobility_case, confounder, by=c("time_value", "geo_value"))
}
# Filter state "pr" as it is not available in the intervention data
factored_data <- intervention_mobility_case %>% filter(!(geo_value %in% c("pr")))

Treatment effect of government interventions

Moreover, we may be interested to know the following:

  • Is there a difference in the means of mobility signal across states?

  • Is there a difference in the means of mobility signal in terms of stay-at-home order?

  • Is there an interaction between the factor of states and the factor of the intervention?

To answer these questions, assuming that the data are normally distributed and the variance across groups are homogeneous, we will use ANOVA (Analysis of Variance). We will check these assumptions in the model diagnostics.

Main effect of States

We can see that some states have a particularly higher range of the mean mobility signal. For example, Montana (MT) clearly stands out from the rest, whereas Hawaii (HI) has a much lower mean mobility signal from Feb. to Sep. in 2020.

# Plotting the main effect of geo_value
plotmeans(full_time_work_prop~ geo_value,data=intervention_mobility_case, xlab="Geo_value", ylab="Mobility", main="Main effect (States)") 

Interaction plots

We can also look at the interaction plots to visually examine whether the effect that an intervention has on the mean mobility signal is independent of the state.

We notice the following interventions may have statistically siginficant interaction with the factor of states on the mean mobility signal:

  • Gathering Reommendation (Recommendation of against gathering that stops short of a formal mandate or restriction of gatherings) , public mask, quarantine.
for(i in 4:17){
  interaction.plot(factored_data[,"geo_value"], factored_data[,names(factored_data)[i]], factored_data$full_time_work_prop, xlab="States", ylab= "Full time work prop", main=paste("Interaction plot between states and", names(factored_data)[i], "on mobility"), trace.label = names(factored_data)[i])  
}

Boxplots of mobility across different levels of intervention

From the boxplots below, we can see there are some obvious difference between having or not having certain interventions on the mobility signals. We will examine whether these differences are statistically significant.

The noteworthy interventions are:

  • Emergency declaration, gathering restriction, restaurant restriction, school closure, bar restriction, other buseinss closure.
# plot the boplots for categoical variables columns
p <- list()
counter <- 1
# loop through all categorical variables for intervention
for (i in 4:17){
p[[counter]]<- ggplot(factored_data, aes_string(x=names(factored_data)[3], y=names(factored_data)[i])) + 
    geom_boxplot() + 
     stat_summary(fun = mean, geom = "errorbar", aes(xmax = ..x.., xmin = ..x..), color ="red", linetype = 2)+
  labs(title=paste("Mobility by", names(factored_data)[i]), x="Mobility", y=names(factored_data)[i]) + theme(plot.title = element_text(size=9))

counter <- counter + 1
}
# Plot all the ggplot
do.call(grid.arrange,p)

Distribution of mobility by various intervention across states

We may also be interested to know if there are some states siginficantly contribute to the difference between having and not having certain interventions on the mobility.

For emergency declaration, all the states tend to have have lower mobility signal when it is implemented. Some states such as New Jersey, lowa, New York, have a clear large difference in mobility between having and not having emergency declaration.

Also, for the school closure policy, the mobility in Mississippi remains quite strong as compared to not having the school closure policy.

Regarding gathering restriction, restaurant restriction, and bar restriction, there isn’t any particular state standing out in terms of the difference of mobility between not having and having the intervention.

# grouped boxplot
for (i in 4:17){
  
p <- ggplot(factored_data, aes_string(x=names(factored_data)[3], y=names(factored_data)[2], fill=names(factored_data)[i])) + 
    geom_boxplot() +
  labs(title=paste("Mobility by State and", names(factored_data)[i]),x="mobility", y="State")

print(p)
}

Transformations of the Response Variable

Distribution of mobility signal is right-skewed, so we may want to consider a transformation to make it more normal like. The square-root transformation seems to work best in this case.

par(mfrow=c(2,2))
hist(factored_data$full_time_work_prop, main="Histogram of unchanged response", xlab="full_time_work_prop")
hist(log(factored_data$full_time_work_prop), main="Histogram of log response", xlab="full_time_work_prop")
hist(1/(factored_data$full_time_work_prop), main="Histogram of 1/response", xlab="full_time_work_prop")
hist(sqrt(factored_data$full_time_work_prop), main="Histogram of squared root of response", xlab="full_time_work_prop")

Histograms of all continuous variables

We also would like to explore some predictor variables.

par(mfrow=c(3,3))
for (i in 19:ncol(factored_data)) 
{
  hist(factored_data[,names(factored_data)[i]], xlab=names(factored_data)[i], main=paste("Histogram of", names(factored_data)[i]))
}

par(mfrow=c(1,1))
# select best lag number based on intial exploration
case.shifted.days.spearman <- 25 # based spearman correlation
doc.visit.shifted.days.spearman <- 40 # based spearman correlation for doctor visit
cum.death.shifted.days.spearman <- 25
death.shifted.days.spearman <- 50

case.vec <- c("confirmed_7dav_incidence_num", "confirmed_7dav_cumulative", "confirmed_7dav_cumulative_prop")
doc.vec <- c("smoothed_cli", "smoothed_adj_cli")
cum.death.vec <- c("deaths_7dav_cumulative_num")
death.vec <- c("deaths_7dav_incidence_num")

for (state in unique(factored_data$geo_value)){
  
  # Filter dataframe by every state
  selected.df <- factored_data %>% filter(geo_value==state)
  
  # Shift the case count column vector by the specified shift time
  factored_data <- shiftDays(selected.df, factored_data, case.shifted.days.spearman, case.vec)
  # Shift the  doctor column vector by the specified shift time
  factored_data <-shiftDays(selected.df, factored_data, doc.visit.shifted.days.spearman, doc.vec)
  
  # Shift the death count column vector by the specified shift time
  factored_data <-shiftDays(selected.df, factored_data, cum.death.shifted.days.spearman, cum.death.vec)
  # Shift the cum death count column vector by the specified shift time
  factored_data <-shiftDays(selected.df, factored_data, death.shifted.days.spearman, death.vec)
}
factored_data$geo_value <- as.factor(factored_data$geo_value)

Linear regression models comparsion (Mobility, Intervention, Confounders)

We modified the data so that the confounders are shifted n days forwarded in time based on rank correlation in our initial exploratary data analysis.

We shifted mobility 25 days ahead for confirmed case count signal, 40 days ahead for doctor visit, and 50 days ahead for death case count.

After shifting the covariates, we also drop some data. The number of samples goes from 12597 to 10914.

We also transform the reponse by taking a square-root of it to satisfy the normality assumption.

#Problematic: EmergDec, deaths_7dav_incidence_num

# model 1 : y is mobility, x's are the confounders and intervention
# Note: we dropped the geo and time value

lm.fit.1 <- lm(full_time_work_prop ~., data=factored_data[,-c(1,24)], na.action = na.exclude)
# Plot boxcox to see which transformation we should use
boxcox(lm.fit.1)

transformed.lm.fit.1<- lm(full_time_work_prop^(1/2) ~., data=factored_data[,-c(1,24)], na.action = na.exclude)

par(mfrow=c(2,2))
plot(lm.fit.1, which=1, main="Residuals vs Fitted (before transformation)") # residuals vs. fitted shows nonlinearity and nonconstant variance
plot(lm.fit.1, which=2, main="Residuals vs Fitted (before transformation)") # heavy tail

plot(transformed.lm.fit.1, which=1, main="Residuals vs Fitted (after transformation)") # residuals vs. fitted shows nonlinearity and nonconstant variance
plot(transformed.lm.fit.1, which=2, main="Normal (after transformation)") # residuals vs. fitted shows nonlinearity and nonconstant variance

par(mfrow=c(1,1))

# Plot the resulting mobility signal over time
plot(factored_data$time_value, factored_data$full_time_work_prop^(1/2), main="Square-root mobility by time", ylab="Square-root mobility", xlab="time")

Model Selection in multiple regression

When we have a lot of predictor variables, instead of using them all, we would prefer choosing a few subsets of them that give us the best models based on a certain criterion.We can also try to find the “best model” sequentially using stepwise regression procedures.

Observations

First, when we focus on finding the “best” additive model, we find the model with the following covariates based on BIC:

  • confirmed_7dav_cumulative_prop
  • geo_value
  • smoothed_cli
  • smoothed_adj_cli
  • SchoolClose
  • confirmed_7dav_incidence_num
  • confirmed_7dav_cumulative
  • TravelRestrictExit
  • GathRestrict

Note that from the summary, among all covidcast signals, only smoothed_cliand confirmed_7dav_cumulative have negative coefficient, whereas all the selected interventions (i.e. SchoolClose, TravelRestrictExit, GathRestrict) have negative coefficients.

The reason why we are particularly interested in negative coefficients because we want to compare the effects of interventions and potential confounders on reducing mobility.

Later, we can also look at the ANOVA model to see if these variables account for the variation of mobility significantly.

# We look at the summary of the selected model
fin_mod <- lm(formula = full_time_work_prop^(1/2) ~ confirmed_7dav_cumulative_prop + smoothed_cli + smoothed_adj_cli + SchoolClose + confirmed_7dav_incidence_num + confirmed_7dav_cumulative + TravelRestrictExit + GathRestrict + geo_value , data = na.omit(factored_data[, -c(1, 24)]))

# Show the summary
summary(fin_mod)
## 
## Call:
## lm(formula = full_time_work_prop^(1/2) ~ confirmed_7dav_cumulative_prop + 
##     smoothed_cli + smoothed_adj_cli + SchoolClose + confirmed_7dav_incidence_num + 
##     confirmed_7dav_cumulative + TravelRestrictExit + GathRestrict + 
##     geo_value, data = na.omit(factored_data[, -c(1, 24)]))
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.102023 -0.014769  0.003076  0.015791  0.089222 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     2.184e-01  2.410e-03  90.608  < 2e-16 ***
## confirmed_7dav_cumulative_prop  1.770e-05  6.522e-07  27.130  < 2e-16 ***
## smoothed_cli                   -2.260e-02  3.578e-04 -63.184  < 2e-16 ***
## smoothed_adj_cli                2.358e-02  4.007e-04  58.856  < 2e-16 ***
## SchoolClose1                   -1.114e-02  7.169e-04 -15.535  < 2e-16 ***
## confirmed_7dav_incidence_num    1.750e-06  2.974e-07   5.883 4.15e-09 ***
## confirmed_7dav_cumulative      -2.190e-08  4.724e-09  -4.637 3.58e-06 ***
## TravelRestrictExit1            -2.051e-02  4.519e-03  -4.540 5.70e-06 ***
## GathRestrict1                  -4.071e-03  1.606e-03  -2.535  0.01126 *  
## geo_valueal                    -1.192e-02  2.465e-03  -4.836 1.34e-06 ***
## geo_valuear                    -1.019e-02  2.470e-03  -4.125 3.73e-05 ***
## geo_valueaz                    -2.000e-02  2.523e-03  -7.928 2.45e-15 ***
## geo_valueca                    -1.373e-03  2.797e-03  -0.491  0.62348    
## geo_valueco                     2.151e-03  2.462e-03   0.874  0.38215    
## geo_valuect                    -1.370e-02  2.491e-03  -5.499 3.91e-08 ***
## geo_valuedc                    -3.752e-02  2.553e-03 -14.698  < 2e-16 ***
## geo_valuede                    -4.637e-02  2.525e-03 -18.368  < 2e-16 ***
## geo_valuefl                    -2.611e-02  2.859e-03  -9.133  < 2e-16 ***
## geo_valuega                    -2.528e-02  2.488e-03 -10.162  < 2e-16 ***
## geo_valuehi                    -4.446e-02  2.524e-03 -17.616  < 2e-16 ***
## geo_valueia                    -5.611e-03  2.522e-03  -2.225  0.02612 *  
## geo_valueid                    -1.021e-02  2.451e-03  -4.164 3.15e-05 ***
## geo_valueil                    -2.277e-02  2.490e-03  -9.145  < 2e-16 ***
## geo_valuein                    -1.293e-02  2.457e-03  -5.262 1.45e-07 ***
## geo_valueks                    -1.707e-02  2.450e-03  -6.967 3.43e-12 ***
## geo_valueky                     1.017e-03  2.630e-03   0.387  0.69899    
## geo_valuela                    -1.947e-02  2.497e-03  -7.795 7.03e-15 ***
## geo_valuema                    -2.686e-02  2.498e-03 -10.753  < 2e-16 ***
## geo_valuemd                    -2.668e-02  2.472e-03 -10.794  < 2e-16 ***
## geo_valueme                     9.155e-06  2.454e-03   0.004  0.99702    
## geo_valuemi                    -1.580e-02  2.460e-03  -6.422 1.40e-10 ***
## geo_valuemn                    -1.704e-02  2.456e-03  -6.936 4.27e-12 ***
## geo_valuemo                    -4.101e-03  2.800e-03  -1.465  0.14298    
## geo_valuems                    -1.328e-02  2.500e-03  -5.312 1.11e-07 ***
## geo_valuemt                     1.171e-03  2.693e-03   0.435  0.66375    
## geo_valuenc                    -1.274e-02  2.501e-03  -5.093 3.58e-07 ***
## geo_valuend                    -2.574e-02  2.928e-03  -8.793  < 2e-16 ***
## geo_valuene                    -3.706e-03  2.469e-03  -1.501  0.13335    
## geo_valuenh                    -1.219e-02  2.444e-03  -4.989 6.17e-07 ***
## geo_valuenj                    -3.810e-02  2.541e-03 -14.995  < 2e-16 ***
## geo_valuenm                     4.271e-03  2.493e-03   1.713  0.08669 .  
## geo_valuenv                    -2.252e-02  2.459e-03  -9.158  < 2e-16 ***
## geo_valueny                    -2.553e-02  2.712e-03  -9.412  < 2e-16 ***
## geo_valueoh                    -1.241e-02  2.455e-03  -5.053 4.43e-07 ***
## geo_valueok                    -1.027e-02  2.451e-03  -4.190 2.81e-05 ***
## geo_valueor                    -8.815e-03  2.449e-03  -3.600  0.00032 ***
## geo_valuepa                    -1.506e-02  2.454e-03  -6.139 8.58e-10 ***
## geo_valueri                    -1.963e-02  2.522e-03  -7.786 7.58e-15 ***
## geo_valuesc                    -1.193e-02  2.511e-03  -4.752 2.04e-06 ***
## geo_valuesd                    -2.358e-02  2.836e-03  -8.314  < 2e-16 ***
## geo_valuetn                    -1.497e-02  2.473e-03  -6.051 1.49e-09 ***
## geo_valuetx                    -2.893e-02  2.667e-03 -10.849  < 2e-16 ***
## geo_valueut                    -3.218e-03  2.452e-03  -1.312  0.18947    
## geo_valueva                    -1.863e-02  2.455e-03  -7.587 3.54e-14 ***
## geo_valuevt                     2.309e-03  2.440e-03   0.947  0.34390    
## geo_valuewa                    -1.940e-02  2.447e-03  -7.926 2.48e-15 ***
## geo_valuewi                    -1.140e-02  2.455e-03  -4.644 3.46e-06 ***
## geo_valuewv                    -6.089e-03  2.454e-03  -2.481  0.01311 *  
## geo_valuewy                    -1.119e-02  2.441e-03  -4.585 4.60e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.02476 on 10498 degrees of freedom
## Multiple R-squared:  0.4358, Adjusted R-squared:  0.4327 
## F-statistic: 139.8 on 58 and 10498 DF,  p-value: < 2.2e-16

Weekend effects

We suspect that the mobility signal is lower than usual during the weekend.

factored_data$weekday <- weekdays(as.Date(factored_data$time_value)) 

p <- ggplot(factored_data, aes(x=weekday, y=full_time_work_prop)) + 
  geom_boxplot()
p

# Drop the weekend
factored_data_without_wd <- factored_data%>% 
  mutate(weekday= weekdays(as.Date(time_value)))%>% 
  filter(!weekday %in% c("Saturday", "Sunday")) 

After we dropped the weekends in the data, we repeat the same process like we did before. By the box cox procedure, we should take the log transformation. We now have selected the following covariates to be included in the new model:

  • confirmed_7dav_cumulative_prop
  • geo_value
  • weekday
  • SchoolClose
  • confirmed_7dav_cumulative
  • GathRestrict
  • PublicMask
  • TravelRestrictExit
  • num.policy.7avg
  • TravelRestrictEntry
mod_without_end <- lm(full_time_work_prop~., data=na.omit(factored_data_without_wd[,-c(1,24)]))
boxcox(mod_without_end)

log.mod_without_end <- lm(log(full_time_work_prop)~., data=na.omit(factored_data_without_wd[,-c(1,24)]))

# plot the response
# Plot the resulting mobility signal over time

plot(factored_data$time_value, factored_data$full_time_work_prop^(1/2), main="Square-root mobility by time (with weekends)", ylab="Square-root mobility", xlab="time")

# Plot the resulting mobility signal over time
plot(factored_data_without_wd$time_value, log(factored_data_without_wd$full_time_work_prop), xlab="Time", ylab = "Full-time away home signal", main="Log mobility by time (without weekends)")

We now fit the data without weekend with another “best model” selected by stepwise regression to see if we can get a better fit.

Observations

Indeed, we get a higher adj. R-squared of 0.638, whereas the previous model has adj. R-squared of 0.4271. Also, note that this model doesn’t contain any confounding variable (from covidcast) that has negative coefficient, whereas all the interventions (i.e. SchoolClose, GathRestrict, TravelRestrict, TravelRestrictEntry, TravelRestrictExit) have negative coefficients except for PublicMask.

Note that SchoolClose, TravelRestrictExit, and GathRestrict also appear significantly in the first model with weekend data.

# We look at the summary of the selected model
fin_mod_without_end <- lm(formula = log(full_time_work_prop) ~ confirmed_7dav_cumulative_prop + weekday + SchoolClose + confirmed_7dav_cumulative + GathRestrict + PublicMask + TravelRestrictExit + num.policy.7avg + TravelRestrictEntry + geo_value, data = na.omit(factored_data_without_wd[, -c(1, 24)]))

# Show the summary
summary(fin_mod_without_end)
## 
## Call:
## lm(formula = log(full_time_work_prop) ~ confirmed_7dav_cumulative_prop + 
##     weekday + SchoolClose + confirmed_7dav_cumulative + GathRestrict + 
##     PublicMask + TravelRestrictExit + num.policy.7avg + TravelRestrictEntry + 
##     geo_value, data = na.omit(factored_data_without_wd[, -c(1, 
##     24)]))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.14465 -0.07109 -0.00006  0.08129  0.44707 
## 
## Coefficients:
##                                  Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)                    -2.971e+00  2.153e-02 -138.007  < 2e-16 ***
## confirmed_7dav_cumulative_prop  1.798e-04  3.964e-06   45.353  < 2e-16 ***
## weekdayMonday                   6.275e-02  4.988e-03   12.580  < 2e-16 ***
## weekdayThursday                 1.162e-01  4.945e-03   23.496  < 2e-16 ***
## weekdayTuesday                  1.429e-01  4.946e-03   28.893  < 2e-16 ***
## weekdayWednesday                1.097e-01  4.945e-03   22.178  < 2e-16 ***
## SchoolClose1                   -8.404e-02  5.572e-03  -15.081  < 2e-16 ***
## confirmed_7dav_cumulative      -2.358e-07  2.896e-08   -8.142 4.50e-16 ***
## GathRestrict1                  -5.599e-02  1.118e-02   -5.008 5.63e-07 ***
## PublicMask1                     4.403e-02  5.416e-03    8.130 4.97e-16 ***
## TravelRestrictExit1            -1.636e-01  2.931e-02   -5.583 2.45e-08 ***
## num.policy.7avg                -5.054e-03  2.267e-03   -2.229 0.025833 *  
## TravelRestrictEntry1           -7.670e-02  1.952e-02   -3.930 8.57e-05 ***
## geo_valueal                    -1.262e-01  1.611e-02   -7.831 5.50e-15 ***
## geo_valuear                    -1.134e-01  1.650e-02   -6.872 6.83e-12 ***
## geo_valueaz                    -3.161e-01  1.661e-02  -19.030  < 2e-16 ***
## geo_valueca                    -7.076e-02  1.759e-02   -4.023 5.80e-05 ***
## geo_valueco                    -9.585e-02  1.617e-02   -5.929 3.18e-09 ***
## geo_valuect                    -2.780e-01  1.618e-02  -17.181  < 2e-16 ***
## geo_valuedc                    -3.137e-01  1.675e-02  -18.724  < 2e-16 ***
## geo_valuede                    -2.722e-01  1.644e-02  -16.560  < 2e-16 ***
## geo_valuefl                    -2.522e-01  1.844e-02  -13.677  < 2e-16 ***
## geo_valuega                    -2.408e-01  1.620e-02  -14.864  < 2e-16 ***
## geo_valuehi                    -5.359e-01  1.706e-02  -31.412  < 2e-16 ***
## geo_valueia                    -5.995e-02  1.743e-02   -3.439 0.000587 ***
## geo_valueid                    -9.534e-02  1.617e-02   -5.895 3.92e-09 ***
## geo_valueil                    -2.998e-01  1.614e-02  -18.570  < 2e-16 ***
## geo_valuein                    -1.766e-01  1.597e-02  -11.057  < 2e-16 ***
## geo_valueks                    -1.709e-01  1.786e-02   -9.568  < 2e-16 ***
## geo_valueky                    -6.746e-02  1.712e-02   -3.939 8.24e-05 ***
## geo_valuela                    -2.144e-01  1.632e-02  -13.142  < 2e-16 ***
## geo_valuema                    -2.874e-01  1.634e-02  -17.589  < 2e-16 ***
## geo_valuemd                    -3.354e-01  1.603e-02  -20.923  < 2e-16 ***
## geo_valueme                     3.224e-02  1.734e-02    1.860 0.062972 .  
## geo_valuemi                    -2.167e-01  1.602e-02  -13.521  < 2e-16 ***
## geo_valuemn                    -2.445e-01  1.638e-02  -14.927  < 2e-16 ***
## geo_valuemo                    -1.517e-01  1.911e-02   -7.940 2.31e-15 ***
## geo_valuems                    -1.199e-01  1.629e-02   -7.361 2.01e-13 ***
## geo_valuemt                     2.702e-02  1.799e-02    1.502 0.133178    
## geo_valuenc                    -1.544e-01  1.618e-02   -9.542  < 2e-16 ***
## geo_valuend                    -1.974e-01  1.994e-02   -9.897  < 2e-16 ***
## geo_valuene                    -7.994e-02  1.599e-02   -4.998 5.92e-07 ***
## geo_valuenh                    -1.281e-01  1.611e-02   -7.948 2.16e-15 ***
## geo_valuenj                    -4.027e-01  1.644e-02  -24.495  < 2e-16 ***
## geo_valuenm                    -1.064e-01  1.612e-02   -6.601 4.37e-11 ***
## geo_valuenv                    -3.086e-01  1.602e-02  -19.266  < 2e-16 ***
## geo_valueny                    -3.008e-01  1.760e-02  -17.091  < 2e-16 ***
## geo_valueoh                    -5.006e-02  2.459e-02   -2.036 0.041817 *  
## geo_valueok                    -7.859e-02  1.694e-02   -4.640 3.54e-06 ***
## geo_valueor                    -1.798e-01  1.674e-02  -10.738  < 2e-16 ***
## geo_valuepa                    -1.794e-01  1.607e-02  -11.163  < 2e-16 ***
## geo_valueri                    -2.528e-01  1.649e-02  -15.333  < 2e-16 ***
## geo_valuesc                    -1.463e-01  1.643e-02   -8.902  < 2e-16 ***
## geo_valuesd                    -1.641e-01  1.998e-02   -8.213 2.51e-16 ***
## geo_valuetn                    -1.845e-01  1.601e-02  -11.523  < 2e-16 ***
## geo_valuetx                    -2.321e-01  1.694e-02  -13.703  < 2e-16 ***
## geo_valueut                    -7.241e-02  1.599e-02   -4.529 6.01e-06 ***
## geo_valueva                    -2.420e-01  1.685e-02  -14.365  < 2e-16 ***
## geo_valuevt                     1.263e-02  1.588e-02    0.795 0.426629    
## geo_valuewa                    -2.382e-01  1.597e-02  -14.912  < 2e-16 ***
## geo_valuewi                    -1.667e-01  1.894e-02   -8.802  < 2e-16 ***
## geo_valuewv                     3.222e-03  1.610e-02    0.200 0.841358    
## geo_valuewy                    -8.336e-02  1.629e-02   -5.119 3.15e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1368 on 7536 degrees of freedom
## Multiple R-squared:  0.6427, Adjusted R-squared:  0.6397 
## F-statistic: 218.6 on 62 and 7536 DF,  p-value: < 2.2e-16

ANOVA

Observations

Without dropping the weekends, from the selected additive model, we see that there are multiple covidcast signals that account for most of the variation of the mean mobility signal(squared-root) at 0.05 significance level. These are:

  • confirmed_7dav_cumulative_prop
  • smoothed_cli
  • smoothed_adj_cli

Also, among all the interventions, SchoolClose is the most significant in explaining the variation of mobility. If we are to compare variables that have negative coefficient in the regression model. We see that smoothed_cli accounts slightly more variation of mobility than SchoolClose in terms of regression sum of squares.

On the other hand, after we drop the weekends in the data, we notice that confirmed_7dav_cumulative_prop and SchoolClose remain significant to explain the difference in mean of log mobility signal.

# Show the summary and compare the models
anova(fin_mod) # with weeekends
## Analysis of Variance Table
## 
## Response: full_time_work_prop^(1/2)
##                                   Df Sum Sq Mean Sq   F value    Pr(>F)    
## confirmed_7dav_cumulative_prop     1 1.1352 1.13521 1851.1170 < 2.2e-16 ***
## smoothed_cli                       1 0.3661 0.36608  596.9489 < 2.2e-16 ***
## smoothed_adj_cli                   1 1.7796 1.77957 2901.8461 < 2.2e-16 ***
## SchoolClose                        1 0.4040 0.40400  658.7738 < 2.2e-16 ***
## confirmed_7dav_incidence_num       1 0.0023 0.00226    3.6794   0.05512 .  
## confirmed_7dav_cumulative          1 0.0209 0.02087   34.0347 5.574e-09 ***
## TravelRestrictExit                 1 0.0021 0.00207    3.3702   0.06641 .  
## GathRestrict                       1 0.0008 0.00080    1.3038   0.25355    
## geo_value                         50 1.2623 0.02525   41.1678 < 2.2e-16 ***
## Residuals                      10498 6.4379 0.00061                        
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(fin_mod_without_end) # without weekends
## Analysis of Variance Table
## 
## Response: log(full_time_work_prop)
##                                  Df  Sum Sq Mean Sq  F value    Pr(>F)    
## confirmed_7dav_cumulative_prop    1 111.911 111.911 5982.063 < 2.2e-16 ***
## weekday                           4  18.914   4.729  252.758 < 2.2e-16 ***
## SchoolClose                       1  25.395  25.395 1357.452 < 2.2e-16 ***
## confirmed_7dav_cumulative         1   4.948   4.948  264.503 < 2.2e-16 ***
## GathRestrict                      1   0.993   0.993   53.059 3.565e-13 ***
## PublicMask                        1   2.476   2.476  132.371 < 2.2e-16 ***
## TravelRestrictExit                1   0.237   0.237   12.680 0.0003718 ***
## num.policy.7avg                   1   0.226   0.226   12.065 0.0005168 ***
## TravelRestrictEntry               1   0.478   0.478   25.560 4.390e-07 ***
## geo_value                        50  88.002   1.760   94.081 < 2.2e-16 ***
## Residuals                      7536 140.982   0.019                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Conclusion

In conclusion, at 0.05 significance level, confirmed_7dav_cumulative_prop is highly significant in explaining the variation of mobility in both with or without weekends data. SchoolClose is also significant in both situations. When we look at the model with weekend data, covidcast signals account for most of the variation of mobility. More importantly, for the variables that have negative coefficient, smoothed_cli appear to account for more variation than SchoolClose. However, as we remove the weekends from the data, smoothed_cliis no longer included in the model and SchoolClose remain being sigificant in explaining the variation of mobility. Also, PublicMask,TravelRestrictExit, GathRestrict, and `TravelRestrictEntry `have become significant in the ANOVA model without weekends data.